import plotly.express as px
import plotly.graph_objects as go
import plotly.offline as offline
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
from sklearn.experimental import enable_iterative_imputer
from sklearn.impute import IterativeImputer, KNNImputer
from sklearn.ensemble import RandomForestClassifier
from sklearn.preprocessing import MinMaxScaler
from imblearn.over_sampling import SMOTE
from sklearn.model_selection import train_test_split
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import RepeatedStratifiedKFold
from sklearn.pipeline import Pipeline
from sklearn.metrics import mean_squared_error
from sklearn import metrics
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.svm import SVC
from sklearn.neighbors import KNeighborsClassifier
import statistics, math
from scipy.stats import shapiro
# IPython magic: render matplotlib figures inline in the notebook
# (not valid in a plain .py script).
%matplotlib inline
# Enable offline plotly rendering inside the notebook.
offline.init_notebook_mode()
# Load the train/test splits; Loan_ID uniquely identifies each application
# and is kept as the index rather than a feature column.
df_train = pd.read_csv("train_u6lujuX_CVtuZ9i.csv",index_col="Loan_ID")
df_test = pd.read_csv("test_Y3wMUE5_7gLdaTN.csv",index_col="Loan_ID")
# Display the training frame (notebook cell output).
df_train
| Gender | Married | Dependents | Education | Self_Employed | ApplicantIncome | CoapplicantIncome | LoanAmount | Loan_Amount_Term | Credit_History | Property_Area | Loan_Status | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|
| Loan_ID | ||||||||||||
| LP001002 | Male | No | 0 | Graduate | No | 5849 | 0.0 | NaN | 360.0 | 1.0 | Urban | Y |
| LP001003 | Male | Yes | 1 | Graduate | No | 4583 | 1508.0 | 128.0 | 360.0 | 1.0 | Rural | N |
| LP001005 | Male | Yes | 0 | Graduate | Yes | 3000 | 0.0 | 66.0 | 360.0 | 1.0 | Urban | Y |
| LP001006 | Male | Yes | 0 | Not Graduate | No | 2583 | 2358.0 | 120.0 | 360.0 | 1.0 | Urban | Y |
| LP001008 | Male | No | 0 | Graduate | No | 6000 | 0.0 | 141.0 | 360.0 | 1.0 | Urban | Y |
| ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
| LP002978 | Female | No | 0 | Graduate | No | 2900 | 0.0 | 71.0 | 360.0 | 1.0 | Rural | Y |
| LP002979 | Male | Yes | 3+ | Graduate | No | 4106 | 0.0 | 40.0 | 180.0 | 1.0 | Rural | Y |
| LP002983 | Male | Yes | 1 | Graduate | No | 8072 | 240.0 | 253.0 | 360.0 | 1.0 | Urban | Y |
| LP002984 | Male | Yes | 2 | Graduate | No | 7583 | 0.0 | 187.0 | 360.0 | 1.0 | Urban | Y |
| LP002990 | Female | No | 0 | Graduate | Yes | 4583 | 0.0 | 133.0 | 360.0 | 0.0 | Semiurban | N |
614 rows × 12 columns
# Column dtypes and non-null counts before any preprocessing.
df_train.info()
<class 'pandas.core.frame.DataFrame'> Index: 614 entries, LP001002 to LP002990 Data columns (total 12 columns): # Column Non-Null Count Dtype --- ------ -------------- ----- 0 Gender 601 non-null object 1 Married 611 non-null object 2 Dependents 599 non-null object 3 Education 614 non-null object 4 Self_Employed 582 non-null object 5 ApplicantIncome 614 non-null int64 6 CoapplicantIncome 614 non-null float64 7 LoanAmount 592 non-null float64 8 Loan_Amount_Term 600 non-null float64 9 Credit_History 564 non-null float64 10 Property_Area 614 non-null object 11 Loan_Status 614 non-null object dtypes: float64(4), int64(1), object(7) memory usage: 62.4+ KB
#converting categorical data to numeric
def change_to_numeric(column, mapping):
    """Map categorical labels in ``df_train[column]`` to numbers.

    Mutates the module-level ``df_train`` in place and returns it.

    Parameters
    ----------
    column : str
        Name of the column in ``df_train`` to encode.
    mapping : dict
        Label -> numeric value mapping passed to ``Series.replace``.

    Note: the original signature took the dict through ``*kwargs`` — a
    misleading name for a positional varargs tuple. A single explicit
    mapping argument keeps every existing call site working unchanged.
    """
    df_train[column] = df_train[column].replace(mapping)
    return df_train
# Encode the binary / small categorical columns as integers so they can be
# imputed and fed to sklearn models (Dependents and Loan_Status are encoded
# in a later cell).
change_to_numeric('Gender',{"Male":1,"Female":0})
change_to_numeric('Married',{"No":0,"Yes":1})
change_to_numeric('Education',{"Graduate":1,"Not Graduate":0})
change_to_numeric('Self_Employed',{"Yes":1,"No":0})
change_to_numeric('Property_Area',{"Urban":0,"Rural":1,"Semiurban":2})
| Gender | Married | Dependents | Education | Self_Employed | ApplicantIncome | CoapplicantIncome | LoanAmount | Loan_Amount_Term | Credit_History | Property_Area | Loan_Status | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|
| Loan_ID | ||||||||||||
| LP001002 | 1.0 | 0.0 | 0 | 1 | 0.0 | 5849 | 0.0 | NaN | 360.0 | 1.0 | 0 | Y |
| LP001003 | 1.0 | 1.0 | 1 | 1 | 0.0 | 4583 | 1508.0 | 128.0 | 360.0 | 1.0 | 1 | N |
| LP001005 | 1.0 | 1.0 | 0 | 1 | 1.0 | 3000 | 0.0 | 66.0 | 360.0 | 1.0 | 0 | Y |
| LP001006 | 1.0 | 1.0 | 0 | 0 | 0.0 | 2583 | 2358.0 | 120.0 | 360.0 | 1.0 | 0 | Y |
| LP001008 | 1.0 | 0.0 | 0 | 1 | 0.0 | 6000 | 0.0 | 141.0 | 360.0 | 1.0 | 0 | Y |
| ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
| LP002978 | 0.0 | 0.0 | 0 | 1 | 0.0 | 2900 | 0.0 | 71.0 | 360.0 | 1.0 | 1 | Y |
| LP002979 | 1.0 | 1.0 | 3+ | 1 | 0.0 | 4106 | 0.0 | 40.0 | 180.0 | 1.0 | 1 | Y |
| LP002983 | 1.0 | 1.0 | 1 | 1 | 0.0 | 8072 | 240.0 | 253.0 | 360.0 | 1.0 | 0 | Y |
| LP002984 | 1.0 | 1.0 | 2 | 1 | 0.0 | 7583 | 0.0 | 187.0 | 360.0 | 1.0 | 0 | Y |
| LP002990 | 0.0 | 0.0 | 0 | 1 | 1.0 | 4583 | 0.0 | 133.0 | 360.0 | 0.0 | 2 | N |
614 rows × 12 columns
# Verify the dtype changes after encoding (Dependents and Loan_Status are
# still object dtype at this point — see the next cells).
df_train.info()
<class 'pandas.core.frame.DataFrame'> Index: 614 entries, LP001002 to LP002990 Data columns (total 12 columns): # Column Non-Null Count Dtype --- ------ -------------- ----- 0 Gender 601 non-null float64 1 Married 611 non-null float64 2 Dependents 599 non-null object 3 Education 614 non-null int64 4 Self_Employed 582 non-null float64 5 ApplicantIncome 614 non-null int64 6 CoapplicantIncome 614 non-null float64 7 LoanAmount 592 non-null float64 8 Loan_Amount_Term 600 non-null float64 9 Credit_History 564 non-null float64 10 Property_Area 614 non-null int64 11 Loan_Status 614 non-null object dtypes: float64(7), int64(3), object(2) memory usage: 62.4+ KB
# Report, per column, whether any value is missing.
for col in df_train.columns:
    has_missing = df_train[col].isna().any()
    print(f"{col} has Na {has_missing}")
Gender has Na True Married has Na True Dependents has Na True Education has Na False Self_Employed has Na True ApplicantIncome has Na False CoapplicantIncome has Na False LoanAmount has Na True Loan_Amount_Term has Na True Credit_History has Na True Property_Area has Na False Loan_Status has Na False
# Total missing cells across all columns, and as a share of the row count.
missing_per_column = df_train.isna().sum()
counter = int(missing_per_column.sum())
print("total missing data ", counter)
print("missing data in percentage {:.1f}%".format((counter/df_train.shape[0])*100))
total missing data 149 missing data in percentage 24.3%
# Encode the remaining categorical columns; the '3+' dependents bucket is
# collapsed to the integer 3.
change_to_numeric('Dependents',{"0":0,"1":1,"2":2,"3+":3})
change_to_numeric('Loan_Status',{"Y":1,"N":0})
| Gender | Married | Dependents | Education | Self_Employed | ApplicantIncome | CoapplicantIncome | LoanAmount | Loan_Amount_Term | Credit_History | Property_Area | Loan_Status | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|
| Loan_ID | ||||||||||||
| LP001002 | 1.0 | 0.0 | 0.0 | 1 | 0.0 | 5849 | 0.0 | NaN | 360.0 | 1.0 | 0 | 1 |
| LP001003 | 1.0 | 1.0 | 1.0 | 1 | 0.0 | 4583 | 1508.0 | 128.0 | 360.0 | 1.0 | 1 | 0 |
| LP001005 | 1.0 | 1.0 | 0.0 | 1 | 1.0 | 3000 | 0.0 | 66.0 | 360.0 | 1.0 | 0 | 1 |
| LP001006 | 1.0 | 1.0 | 0.0 | 0 | 0.0 | 2583 | 2358.0 | 120.0 | 360.0 | 1.0 | 0 | 1 |
| LP001008 | 1.0 | 0.0 | 0.0 | 1 | 0.0 | 6000 | 0.0 | 141.0 | 360.0 | 1.0 | 0 | 1 |
| ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
| LP002978 | 0.0 | 0.0 | 0.0 | 1 | 0.0 | 2900 | 0.0 | 71.0 | 360.0 | 1.0 | 1 | 1 |
| LP002979 | 1.0 | 1.0 | 3.0 | 1 | 0.0 | 4106 | 0.0 | 40.0 | 180.0 | 1.0 | 1 | 1 |
| LP002983 | 1.0 | 1.0 | 1.0 | 1 | 0.0 | 8072 | 240.0 | 253.0 | 360.0 | 1.0 | 0 | 1 |
| LP002984 | 1.0 | 1.0 | 2.0 | 1 | 0.0 | 7583 | 0.0 | 187.0 | 360.0 | 1.0 | 0 | 1 |
| LP002990 | 0.0 | 0.0 | 0.0 | 1 | 1.0 | 4583 | 0.0 | 133.0 | 360.0 | 0.0 | 2 | 0 |
614 rows × 12 columns
# checking for outlier
def outlier(vals):
    """Return ``{column: [unique values whose |z-score| > 3]}`` per column.

    Z-scores use the column mean and population standard deviation
    (``np.std``, ddof=0). Each outlying value is reported once, in order
    of first appearance.

    Parameters
    ----------
    vals : pd.DataFrame
        Frame of numeric columns to scan.

    Returns
    -------
    dict mapping column name -> list of outlier values (possibly empty).
    """
    dict_holder = {}
    threshold = 3  # |z| above this marks a value as an outlier
    for column in vals.columns:
        data = vals[column].to_numpy()
        mean = np.mean(data)
        std = np.std(data)
        found = []
        # Guard: a constant column (std == 0) has no outliers; the old code
        # divided 0/0 into NaN z-scores (same empty result, but a warning).
        if std > 0:
            for value in data:
                z = (value - mean) / std
                if abs(z) > threshold and value not in found:
                    found.append(value)
        dict_holder[column] = found
    return dict_holder
# Separate features from the Loan_Status target, then scan the features for
# per-column z-score outliers (displayed as the cell output).
X_train = df_train.drop('Loan_Status',axis=1)
y_train = df_train['Loan_Status']
outlier(X_train)
{'Gender': [],
'Married': [],
'Dependents': [],
'Education': [],
'Self_Employed': [],
'ApplicantIncome': [23803, 39999, 51763, 33846, 39147, 63337, 81000, 37719],
'CoapplicantIncome': [10968.0, 11300.0, 20000.0, 33837.0, 41667.0],
'LoanAmount': [],
'Loan_Amount_Term': [],
'Credit_History': [],
'Property_Area': []}
From the outlier function we know that ApplicantIncome and CoapplicantIncome contain outliers.
To address that, those records will be removed from the dataset.
#dropping the outlier
# Keep only the rows whose incomes are NOT among the outlier values found
# by the z-score scan above.
applicant_outlier_vals = [23803, 39999, 51763, 33846, 39147, 63337, 81000, 37719]
coapplicant_outlier_vals = [10968.0, 11300.0, 20000.0, 33837.0, 41667.0]
X_train = X_train[~X_train['ApplicantIncome'].isin(applicant_outlier_vals)
                  & ~X_train['CoapplicantIncome'].isin(coapplicant_outlier_vals)]
The code above removed the outlier records.
# Impute the remaining NaNs from the 3 nearest neighbors, then wrap the
# resulting ndarray back into a DataFrame with the original column names.
loan_imputer_after_outlier = KNNImputer(n_neighbors=3)
df_loan_imputer_after_outlier = pd.DataFrame(
    loan_imputer_after_outlier.fit_transform(X_train),
    columns=X_train.columns,
)
# Summary statistics of the imputed frame (notebook cell output).
df_loan_imputer_after_outlier.describe()
| Gender | Married | Dependents | Education | Self_Employed | ApplicantIncome | CoapplicantIncome | LoanAmount | Loan_Amount_Term | Credit_History | Property_Area | |
|---|---|---|---|---|---|---|---|---|---|---|---|
| count | 600.000000 | 600.000000 | 600.000000 | 600.000000 | 600.000000 | 600.000000 | 600.000000 | 600.000000 | 600.000000 | 600.000000 | 600.000000 |
| mean | 0.811111 | 0.651111 | 0.745000 | 0.776667 | 0.137778 | 4870.668333 | 1421.538200 | 142.158889 | 342.533333 | 0.845556 | 1.048333 |
| std | 0.388418 | 0.476627 | 0.988701 | 0.416827 | 0.341169 | 3380.099718 | 1684.082008 | 77.648741 | 64.161593 | 0.352323 | 0.838950 |
| min | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 150.000000 | 0.000000 | 9.000000 | 12.000000 | 0.000000 | 0.000000 |
| 25% | 1.000000 | 0.000000 | 0.000000 | 1.000000 | 0.000000 | 2880.500000 | 0.000000 | 100.000000 | 360.000000 | 1.000000 | 0.000000 |
| 50% | 1.000000 | 1.000000 | 0.000000 | 1.000000 | 0.000000 | 3768.500000 | 1188.500000 | 127.000000 | 360.000000 | 1.000000 | 1.000000 |
| 75% | 1.000000 | 1.000000 | 1.083333 | 1.000000 | 0.000000 | 5704.250000 | 2253.250000 | 164.083333 | 360.000000 | 1.000000 | 2.000000 |
| max | 1.000000 | 1.000000 | 3.000000 | 1.000000 | 1.000000 | 20833.000000 | 8980.000000 | 650.000000 | 480.000000 | 1.000000 | 2.000000 |
# scaling the data
# Rescale every feature into [0, 1] with MinMaxScaler, keeping the original
# column labels and index.
scaler = MinMaxScaler()
scaled_values = scaler.fit_transform(df_loan_imputer_after_outlier)
scaled_data_Xtrain = pd.DataFrame(scaled_values,
                                  columns=df_loan_imputer_after_outlier.columns,
                                  index=df_loan_imputer_after_outlier.index)
# Display the scaled frame (notebook cell output).
scaled_data_Xtrain
| Gender | Married | Dependents | Education | Self_Employed | ApplicantIncome | CoapplicantIncome | LoanAmount | Loan_Amount_Term | Credit_History | Property_Area | |
|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | 1.0 | 0.0 | 0.000000 | 1.0 | 0.0 | 0.275540 | 0.000000 | 0.201248 | 0.743590 | 1.0 | 0.0 |
| 1 | 1.0 | 1.0 | 0.333333 | 1.0 | 0.0 | 0.214331 | 0.167929 | 0.185647 | 0.743590 | 1.0 | 0.5 |
| 2 | 1.0 | 1.0 | 0.000000 | 1.0 | 1.0 | 0.137794 | 0.000000 | 0.088924 | 0.743590 | 1.0 | 0.0 |
| 3 | 1.0 | 1.0 | 0.000000 | 0.0 | 0.0 | 0.117633 | 0.262584 | 0.173167 | 0.743590 | 1.0 | 0.0 |
| 4 | 1.0 | 0.0 | 0.000000 | 1.0 | 0.0 | 0.282841 | 0.000000 | 0.205928 | 0.743590 | 1.0 | 0.0 |
| ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
| 595 | 0.0 | 0.0 | 0.000000 | 1.0 | 0.0 | 0.132959 | 0.000000 | 0.096724 | 0.743590 | 1.0 | 0.5 |
| 596 | 1.0 | 1.0 | 1.000000 | 1.0 | 0.0 | 0.191268 | 0.000000 | 0.048362 | 0.358974 | 1.0 | 0.5 |
| 597 | 1.0 | 1.0 | 0.333333 | 1.0 | 0.0 | 0.383020 | 0.026726 | 0.380655 | 0.743590 | 1.0 | 0.0 |
| 598 | 1.0 | 1.0 | 0.666667 | 1.0 | 0.0 | 0.359377 | 0.000000 | 0.277691 | 0.743590 | 1.0 | 0.0 |
| 599 | 0.0 | 0.0 | 0.000000 | 1.0 | 1.0 | 0.214331 | 0.000000 | 0.193448 | 0.743590 | 0.0 | 1.0 |
600 rows × 11 columns
The table above shows the dataset scaled with MinMaxScaler.
# Box plots of the three continuous features after scaling, to inspect
# spread/outliers visually. In a notebook, the last add_trace call returns
# the figure and renders it as the cell output.
fig = go.Figure()
fig.add_trace(go.Box(x=scaled_data_Xtrain['ApplicantIncome']))
fig.add_trace(go.Box(x=scaled_data_Xtrain['CoapplicantIncome']))
fig.add_trace(go.Box(x=scaled_data_Xtrain['LoanAmount']))
#shapiro normal test
# Shapiro-Wilk normality p-value per scaled column, plus the average.
# (tampungan ~ "container" in Indonesian.)
tampungan = []
for col in X_train.columns:
    # Compute the p-value once per column; the original called shapiro()
    # twice per column (once for the list, once for the print).
    p_value = shapiro(scaled_data_Xtrain[col])[1]
    tampungan.append(p_value)
    print("Sig {} = {}".format(col, p_value))
print("Avg p val:", statistics.mean(tampungan))
Sig Gender = 2.542326758378264e-38 Sig Married = 7.659532669395045e-35 Sig Dependents = 9.334702578749888e-30 Sig Education = 1.5582305519678162e-37 Sig Self_Employed = 4.851673634077884e-40 Sig ApplicantIncome = 8.280172878469025e-29 Sig CoapplicantIncome = 8.615789046511415e-26 Sig LoanAmount = 3.7817648980765205e-27 Sig Loan_Amount_Term = 1.5172599760132422e-38 Sig Credit_History = 4.100562242916675e-39 Sig Property_Area = 1.5124788150529672e-27 Avg p val: 8.322206426036674e-27
# Pairwise correlation heatmap of the scaled features.
plt.figure(figsize=(18,14))
correlations = scaled_data_Xtrain.corr()
sns.heatmap(correlations,
            xticklabels=scaled_data_Xtrain.columns,
            yticklabels=scaled_data_Xtrain.columns,
            annot=True, linewidths=0.5, cmap="YlGnBu")
<AxesSubplot:>
def optimize_k(data, target, k_min=2, k_max=22, random_state=None):
    """Score KNNImputer neighbor counts by downstream-classifier RMSE.

    For each k in [k_min, k_max), impute ``data`` with ``KNNImputer(k)``,
    train a RandomForestClassifier on a fixed 80/20 split, and record the
    RMSE of its held-out predictions.

    Parameters
    ----------
    data : pd.DataFrame
        Frame containing both the features and the target column.
    target : str
        Name of the target column inside ``data``.
    k_min, k_max : int, optional
        Half-open range of neighbor counts to try (defaults keep the
        original 2..21 behavior).
    random_state : int or None, optional
        Seed for the RandomForestClassifier; None (the default) keeps the
        original unseeded behavior.

    Returns
    -------
    list of dict
        One ``{'K': k, 'RMSE': error}`` entry per candidate k.
    """
    def rmse(y, yhat):
        return np.sqrt(mean_squared_error(y, yhat))

    errors = []
    for k in range(k_min, k_max):
        imputer = KNNImputer(n_neighbors=k)
        df_imputed = pd.DataFrame(imputer.fit_transform(data), columns=data.columns)
        X = df_imputed.drop(target, axis=1)
        y = df_imputed[target]
        # Local names avoid shadowing the module-level X_train/y_train.
        X_tr, X_te, y_tr, y_te = train_test_split(X, y, test_size=0.2, random_state=42)
        model = RandomForestClassifier(random_state=random_state)
        model.fit(X_tr, y_tr)
        errors.append({'K': k, 'RMSE': rmse(y_te, model.predict(X_te))})
    return errors
# Evaluate candidate neighbor counts on the loan frame. NOTE(review): the
# '_wine' suffix looks like a leftover from a wine-dataset tutorial — the
# data here is the loan DataFrame.
optimum_k_wine = optimize_k(data=df_train, target='Loan_Status')
optimum_k_wine
[{'K': 2, 'RMSE': 0.4685212856658182},
{'K': 3, 'RMSE': 0.4685212856658182},
{'K': 4, 'RMSE': 0.4771187236136979},
{'K': 5, 'RMSE': 0.4685212856658182},
{'K': 6, 'RMSE': 0.4685212856658182},
{'K': 7, 'RMSE': 0.4685212856658182},
{'K': 8, 'RMSE': 0.4597631061983315},
{'K': 9, 'RMSE': 0.48556395842728817},
{'K': 10, 'RMSE': 0.4771187236136979},
{'K': 11, 'RMSE': 0.4685212856658182},
{'K': 12, 'RMSE': 0.4597631061983315},
{'K': 13, 'RMSE': 0.4685212856658182},
{'K': 14, 'RMSE': 0.4597631061983315},
{'K': 15, 'RMSE': 0.4685212856658182},
{'K': 16, 'RMSE': 0.4597631061983315},
{'K': 17, 'RMSE': 0.4685212856658182},
{'K': 18, 'RMSE': 0.4685212856658182},
{'K': 19, 'RMSE': 0.4771187236136979},
{'K': 20, 'RMSE': 0.4771187236136979},
{'K': 21, 'RMSE': 0.4771187236136979}]
results = list()
# Neighbor counts to compare. Kept as plain ints: the original built strings
# with str(i) and converted back with int(s), a redundant round-trip; the
# '>%s' print and the boxplot tick labels render identically either way.
strategies = [1, 3, 5, 7, 9, 15, 18, 21]
for s in strategies:
    # create the modeling pipeline: impute, then classify
    pipeline = Pipeline(steps=[('i', KNNImputer(n_neighbors=s)), ('m', RandomForestClassifier())])
    # evaluate the model with repeated stratified 10-fold CV
    cv = RepeatedStratifiedKFold(n_splits=10, n_repeats=3, random_state=1)
    scores = cross_val_score(pipeline, df_train.drop("Loan_Status",axis=1), df_train['Loan_Status'], scoring='accuracy', cv=cv, n_jobs=-1)
    # store results
    results.append(scores)
    print('>%s %.3f (%.3f)' % (s, np.mean(scores), np.std(scores)))
# plot model performance for comparison
plt.boxplot(results, labels=strategies, showmeans=True)
plt.show()
>1 0.774 (0.047) >3 0.783 (0.042) >5 0.784 (0.042) >7 0.791 (0.041) >9 0.782 (0.044) >15 0.787 (0.037) >18 0.785 (0.043) >21 0.786 (0.040)
From the results above we can see that 7 neighbors gives the best accuracy.
#imputing data
# Impute the features with the best neighbor count found above (k=7).
loan_imputer = KNNImputer(n_neighbors=7)
features_only = df_train.drop("Loan_Status", axis=1)
imputed_features = loan_imputer.fit_transform(features_only)
# Re-attach the untouched target as the last column; df_train.columns ends
# with Loan_Status, so the labels line up with np.c_'s column order.
df_loan_imputer = pd.DataFrame(np.c_[imputed_features, df_train['Loan_Status']],
                               columns=df_train.columns)
df_loan_imputer.info()
<class 'pandas.core.frame.DataFrame'> RangeIndex: 614 entries, 0 to 613 Data columns (total 12 columns): # Column Non-Null Count Dtype --- ------ -------------- ----- 0 Gender 614 non-null float64 1 Married 614 non-null float64 2 Dependents 614 non-null float64 3 Education 614 non-null float64 4 Self_Employed 614 non-null float64 5 ApplicantIncome 614 non-null float64 6 CoapplicantIncome 614 non-null float64 7 LoanAmount 614 non-null float64 8 Loan_Amount_Term 614 non-null float64 9 Credit_History 614 non-null float64 10 Property_Area 614 non-null float64 11 Loan_Status 614 non-null float64 dtypes: float64(12) memory usage: 57.7 KB
# Confirm no feature column still contains missing values after imputation.
for col in df_train.columns.drop("Loan_Status"):
    print(f"{col} has Na {df_loan_imputer[col].isna().any()}")
Gender has Na False Married has Na False Dependents has Na False Education has Na False Self_Employed has Na False ApplicantIncome has Na False CoapplicantIncome has Na False LoanAmount has Na False Loan_Amount_Term has Na False Credit_History has Na False Property_Area has Na False
# Re-run the z-score outlier scan on the fully imputed feature matrix
# (displayed as the cell output).
outlier_json = outlier(df_loan_imputer.drop("Loan_Status", axis=1))
outlier_json
{'Gender': [],
'Married': [],
'Dependents': [],
'Education': [],
'Self_Employed': [],
'ApplicantIncome': [23803.0,
39999.0,
51763.0,
33846.0,
39147.0,
63337.0,
81000.0,
37719.0],
'CoapplicantIncome': [10968.0, 11300.0, 20000.0, 33837.0, 41667.0],
'LoanAmount': [650.0,
600.0,
700.0,
495.0,
436.0,
446.42857142857144,
480.0,
490.0,
570.0,
405.0,
500.0,
496.0],
'Loan_Amount_Term': [120.0, 60.0, 36.0, 84.0, 12.0],
'Credit_History': [],
'Property_Area': []}
#dropping the outlier
# Build one boolean mask that excludes every outlier value found in any of
# the four continuous columns, then filter the imputed frame with it.
keep_mask = (
    ~df_loan_imputer['ApplicantIncome'].isin(outlier_json["ApplicantIncome"])
    & ~df_loan_imputer['CoapplicantIncome'].isin(outlier_json['CoapplicantIncome'])
    & ~df_loan_imputer['LoanAmount'].isin(outlier_json['LoanAmount'])
    & ~df_loan_imputer['Loan_Amount_Term'].isin(outlier_json['Loan_Amount_Term'])
)
df_loan_imputer_before_outlier = df_loan_imputer[keep_mask]
df_loan_imputer_before_outlier.head()
| Gender | Married | Dependents | Education | Self_Employed | ApplicantIncome | CoapplicantIncome | LoanAmount | Loan_Amount_Term | Credit_History | Property_Area | Loan_Status | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | 1.0 | 0.0 | 0.0 | 1.0 | 0.0 | 5849.0 | 0.0 | 160.285714 | 360.0 | 1.0 | 0.0 | 1.0 |
| 1 | 1.0 | 1.0 | 1.0 | 1.0 | 0.0 | 4583.0 | 1508.0 | 128.000000 | 360.0 | 1.0 | 1.0 | 0.0 |
| 2 | 1.0 | 1.0 | 0.0 | 1.0 | 1.0 | 3000.0 | 0.0 | 66.000000 | 360.0 | 1.0 | 0.0 | 1.0 |
| 3 | 1.0 | 1.0 | 0.0 | 0.0 | 0.0 | 2583.0 | 2358.0 | 120.000000 | 360.0 | 1.0 | 0.0 | 1.0 |
| 4 | 1.0 | 0.0 | 0.0 | 1.0 | 0.0 | 6000.0 | 0.0 | 141.000000 | 360.0 | 1.0 | 0.0 | 1.0 |
# Show the unique values of every non-income column; KNN imputation has
# introduced fractional values into originally discrete columns.
income_columns = {"ApplicantIncome", "CoapplicantIncome"}
for col in df_loan_imputer_before_outlier.columns:
    if col in income_columns:
        continue
    print("unique val for {} are {}".format(col, df_loan_imputer_before_outlier[col].unique()))
unique val for Gender are [1. 0. 0.71428571 0.85714286 0.57142857] unique val for Married are [0. 1. 0.57142857] unique val for Dependents are [0. 1. 2. 3. 0.57142857 1.28571429 0.42857143 0.85714286 1.14285714 1.42857143 0.71428571] unique val for Education are [1. 0.] unique val for Self_Employed are [0. 1. 0.14285714 0.28571429 0.42857143 0.71428571] unique val for LoanAmount are [160.28571429 128. 66. 120. 141. 267. 95. 158. 168. 70. 109. 200. 114. 125. 100. 76. 133. 115. 104. 315. 116. 112. 151. 191. 122. 110. 35. 201. 74. 106. 320. 97.57142857 144. 184. 80. 47. 75. 134. 96. 88. 44. 286. 97. 135. 180. 99. 165. 126.57142857 258. 126. 312. 136. 172. 81. 187. 113. 176. 130. 111. 55.14285714 167. 265. 50. 210. 175. 131. 188. 156.14285714 137. 210.57142857 182.71428571 160. 225. 216. 94. 139. 152. 148.28571429 118. 185. 154. 85. 259. 194. 93. 130.14285714 182. 102. 290. 84. 242. 129. 30. 244. 255. 98. 275. 121. 63. 87. 101. 67. 73. 108. 58. 48. 164. 170. 83. 90. 166. 86. 124. 55. 59. 127. 214. 240. 72. 60. 138. 42. 280. 140. 123. 279. 192. 304. 330. 150. 207. 78. 54. 89. 143. 105. 132. 67.85714286 56. 159. 155. 300. 376. 117. 71. 64.28571429 173. 46. 228. 308. 236. 380. 296. 156. 103. 45. 65. 53. 62. 218. 178. 195.28571429 91.71428571 148. 190. 149. 153. 162. 230. 234. 246. 186. 119. 107. 209. 208. 243. 40. 250. 311. 159.85714286 400. 161. 196. 324. 157. 145. 152.57142857 143.14285714 181. 26. 260. 211. 9. 205. 36. 61. 146. 292. 142. 253. ] unique val for Loan_Amount_Term are [360. 240. 308.57142857 325.71428571 313.71428571 180. 300. 377.14285714 480. 334.28571429 368.57142857] unique val for Credit_History are [1. 0. 0.85714286 0.71428571 0.57142857] unique val for Property_Area are [0. 1. 2.] unique val for Loan_Status are [1. 0.]
The imputed data contains fractional (regression-like) values. The next code rounds the imputed values up (np.ceil) so they match the discrete values in the dataset.
# Round every imputed value UP to the next integer (np.ceil — not nearest
# rounding) so the fractional KNN averages match the discrete code values.
df_loan_imputer_before_outlier_rounded = df_loan_imputer_before_outlier.apply(np.ceil)
income_cols = ("ApplicantIncome", "CoapplicantIncome")
for col in df_loan_imputer_before_outlier.columns:
    if col not in income_cols:
        print("unique val for {} are {}".format(col, df_loan_imputer_before_outlier_rounded[col].unique()))
unique val for Gender are [1. 0.] unique val for Married are [0. 1.] unique val for Dependents are [0. 1. 2. 3.] unique val for Education are [1. 0.] unique val for Self_Employed are [0. 1.] unique val for LoanAmount are [161. 128. 66. 120. 141. 267. 95. 158. 168. 70. 109. 200. 114. 125. 100. 76. 133. 115. 104. 315. 116. 112. 151. 191. 122. 110. 35. 201. 74. 106. 320. 98. 144. 184. 80. 47. 75. 134. 96. 88. 44. 286. 97. 135. 180. 99. 165. 127. 258. 126. 312. 136. 172. 81. 187. 113. 176. 130. 111. 56. 167. 265. 50. 210. 175. 131. 188. 157. 137. 211. 183. 160. 225. 216. 94. 139. 152. 149. 118. 185. 154. 85. 259. 194. 93. 182. 102. 290. 84. 242. 129. 30. 244. 255. 275. 121. 63. 87. 101. 67. 73. 108. 58. 48. 164. 170. 83. 90. 166. 86. 124. 55. 59. 214. 240. 72. 60. 138. 42. 280. 140. 123. 279. 192. 304. 330. 150. 207. 78. 54. 89. 143. 105. 132. 68. 159. 155. 300. 376. 117. 71. 65. 173. 46. 228. 308. 236. 380. 296. 156. 103. 45. 53. 62. 218. 178. 196. 92. 148. 190. 153. 162. 230. 234. 246. 186. 119. 107. 209. 208. 243. 40. 250. 311. 400. 324. 145. 181. 26. 260. 9. 205. 36. 61. 146. 292. 142. 253.] unique val for Loan_Amount_Term are [360. 240. 309. 326. 314. 180. 300. 378. 480. 335. 369.] unique val for Credit_History are [1. 0.] unique val for Property_Area are [0. 1. 2.] unique val for Loan_Status are [1. 0.]
Since imputing before outlier removal shows better performance (a higher average Shapiro-Wilk p-value than imputing after outlier removal), this section only shows the scaled SMOTE sample. For the raw data, change the variable name from "oversampled" to "df_loan_imputer_before_outlier_rounded". Note: "oversampled" is created by the SMOTE cell further below, so that cell must be run first.
# scaled the data
# NOTE(review): `oversampled` is only created by the SMOTE cell further
# below (sm.fit_resample / pd.concat) — the notebook cells were executed
# out of order, so that cell must run before this one.
scaler = MinMaxScaler()
scaled_data_Xtrain_before_outlier = pd.DataFrame(scaler.fit_transform(oversampled.drop("Loan_Status",axis=1)), columns=oversampled.columns.drop("Loan_Status"), index=oversampled.index)
# Features (scaled) and target used by all of the model cells below.
X = scaled_data_Xtrain_before_outlier
y = oversampled['Loan_Status']
# Box plots of the four continuous features after SMOTE + scaling; the last
# add_trace call returns the figure and renders it as the cell output.
fig = go.Figure()
fig.add_trace(go.Box(x=scaled_data_Xtrain_before_outlier['ApplicantIncome']))
fig.add_trace(go.Box(x=scaled_data_Xtrain_before_outlier['CoapplicantIncome']))
fig.add_trace(go.Box(x=scaled_data_Xtrain_before_outlier['LoanAmount']))
fig.add_trace(go.Box(x=scaled_data_Xtrain_before_outlier['Loan_Amount_Term']))
The graph above shows the distribution of the SMOTE-resampled data. There are still outliers in the data, but we keep them for training since we want to compare against the raw model under the same data conditions.
#shapiro normal test
# Shapiro-Wilk normality p-value per scaled SMOTE column, plus the average.
tampungan = []
for col in scaled_data_Xtrain_before_outlier.columns:
    # Compute the p-value once per column; the original called shapiro()
    # twice per column (once for the list, once for the print).
    p_value = shapiro(scaled_data_Xtrain_before_outlier[col])[1]
    tampungan.append(p_value)
    print("Sig {} = {}".format(col, p_value))
print("Avg p val:", statistics.mean(tampungan))
Sig Gender = 5.774750971482571e-42 Sig Married = 2.447818649737174e-37 Sig Dependents = 6.032808042848024e-32 Sig Education = 7.0626843900435105e-40 Sig Self_Employed = 3.44719422223905e-43 Sig ApplicantIncome = 2.3635308690175662e-29 Sig CoapplicantIncome = 1.3495359454423226e-28 Sig LoanAmount = 1.6527847222561058e-18 Sig Loan_Amount_Term = 1.4423565093295342e-41 Sig Credit_History = 9.162600203757465e-40 Sig Property_Area = 3.6917154435212704e-28 Avg p val: 1.502531566167206e-19
# Pairwise correlation heatmap of the scaled SMOTE features.
plt.figure(figsize=(18,14))
corr_matrix = scaled_data_Xtrain_before_outlier.corr()
sns.heatmap(corr_matrix,
            xticklabels=scaled_data_Xtrain_before_outlier.columns,
            yticklabels=scaled_data_Xtrain_before_outlier.columns,
            annot=True, linewidths=0.5, cmap="YlGnBu")
<AxesSubplot:>
#splitting test and train data
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.33, random_state=42)
# Logistic regression baseline. The confusion matrix rows are the true
# labels in [0, 1] order; this notebook treats class 0 (rejected loan) as
# the positive class, so cm[0][0] is printed as TP.
lr = LogisticRegression()
lr_clf = lr.fit(X_train, y_train)
y_pred = lr_clf.predict(X_test)
clf_metrics = metrics.confusion_matrix(y_test, y_pred)
tp, fn = clf_metrics[0]
fp, tn = clf_metrics[1]
print(f"TP : {tp}, FN : {fn}, FP : {fp}, TN : {tn}")
print(f"Recall : {tp / (tp + fn)}")
TP : 30, FN : 30, FP : 1, TN : 130 Recall : 0.5
# Per-class precision/recall/F1 for the logistic-regression baseline.
clf_report = metrics.classification_report(y_test,y_pred)
print(clf_report)
precision recall f1-score support
0.0 0.97 0.50 0.66 60
1.0 0.81 0.99 0.89 131
accuracy 0.84 191
macro avg 0.89 0.75 0.78 191
weighted avg 0.86 0.84 0.82 191
# Decision-tree baseline; same evaluation recipe as the other classifiers.
dtc = DecisionTreeClassifier()
dtc_clf = dtc.fit(X_train, y_train)
y_pred = dtc_clf.predict(X_test)
clf_metrics = metrics.confusion_matrix(y_test, y_pred)
# ravel() flattens row-major: cm[0][0], cm[0][1], cm[1][0], cm[1][1].
print("TP : {}, FN : {}, FP : {}, TN : {}".format(*clf_metrics.ravel()))
print("Recall : {}".format(clf_metrics[0, 0] / clf_metrics[0].sum()))
TP : 38, FN : 22, FP : 28, TN : 103 Recall : 0.6333333333333333
# Per-class precision/recall/F1 for the decision-tree baseline.
clf_report = metrics.classification_report(y_test,y_pred)
print(clf_report)
precision recall f1-score support
0.0 0.58 0.63 0.60 60
1.0 0.82 0.79 0.80 131
accuracy 0.74 191
macro avg 0.70 0.71 0.70 191
weighted avg 0.75 0.74 0.74 191
# K-nearest-neighbors baseline; same evaluation recipe as the others.
knn = KNeighborsClassifier()
knn_clf = knn.fit(X_train, y_train)
y_pred = knn_clf.predict(X_test)
clf_metrics = metrics.confusion_matrix(y_test, y_pred)
cells = [clf_metrics[0][0], clf_metrics[0][1], clf_metrics[1][0], clf_metrics[1][1]]
print("TP : {}, FN : {}, FP : {}, TN : {}".format(*cells))
print("Recall : {}".format(cells[0] / (cells[0] + cells[1])))
TP : 32, FN : 28, FP : 8, TN : 123 Recall : 0.5333333333333333
# Per-class precision/recall/F1 for the KNN baseline.
clf_report = metrics.classification_report(y_test,y_pred)
print(clf_report)
precision recall f1-score support
0.0 0.80 0.53 0.64 60
1.0 0.81 0.94 0.87 131
accuracy 0.81 191
macro avg 0.81 0.74 0.76 191
weighted avg 0.81 0.81 0.80 191
# Support-vector-machine baseline; same evaluation recipe as the others.
svc = SVC()
svc_clf = svc.fit(X_train, y_train)
y_pred = svc_clf.predict(X_test)
clf_metrics = metrics.confusion_matrix(y_test, y_pred)
true_rej, false_app = clf_metrics[0]
false_rej, true_app = clf_metrics[1]
print("TP : {}, FN : {}, FP : {}, TN : {}".format(true_rej, false_app, false_rej, true_app))
print("Recall : {}".format(true_rej / (true_rej + false_app)))
TP : 30, FN : 30, FP : 1, TN : 130 Recall : 0.5
# Per-class precision/recall/F1 for the SVC baseline.
clf_report = metrics.classification_report(y_test,y_pred)
print(clf_report)
precision recall f1-score support
0.0 0.97 0.50 0.66 60
1.0 0.81 0.99 0.89 131
accuracy 0.84 191
macro avg 0.89 0.75 0.78 191
weighted avg 0.86 0.84 0.82 191
From the raw model classifiers above we know that logistic regression, KNN and SVC tend to approve loans, because the data contains more approved-loan records.
The decision tree model gives a better balance in recall, but with lower accuracy, and it still favors the approved-loan class.
Since the models are not very good, this notebook optimizes the data to handle the class imbalance (e.g. resampling) to support the models.
# univariate analysis
# Histogram of the target and each binary/categorical feature, one subplot
# per series (same axis placement as before, just driven by a table).
figure, axis = plt.subplots(2, 3, figsize=(18, 14))
panels = [
    (y_train, axis[0][0]),
    (X_train.Married, axis[0][1]),
    (X_train.Property_Area, axis[0][2]),
    (X_train.Gender, axis[1][0]),
    (X_train.Education, axis[1][1]),
    (X_train.Self_Employed, axis[1][2]),
]
for series, ax in panels:
    sns.histplot(series, ax=ax)
plt.show()
From the histplots above we can see the distribution of the target and of each categorical feature in the training set.
# bivariate analysis: each feature split by Loan_Status
# (same axis placement as before, driven by a table).
figure, axis = plt.subplots(2, 3, figsize=(18, 14))
placements = [
    ("Married", axis[0][0]),
    ("Gender", axis[0][1]),
    ("Credit_History", axis[0][2]),
    ("Education", axis[1][0]),
    ("Self_Employed", axis[1][1]),
    ("Property_Area", axis[1][2]),
]
for col, ax in placements:
    sns.histplot(df_train, x=col, hue="Loan_Status", multiple="dodge", ax=ax)
plt.show()
From the histplots above we know that approved loans have more records in the dataset than rejected loans; this condition can bias a machine learning model. So we will optimize the data before building the models to reduce that bias.
# Dependent counts split by marital status.
plt.figure(figsize=(18, 14))
sns.histplot(data=df_loan_imputer_before_outlier_rounded, x="Dependents",
             hue="Married", multiple="dodge")
<AxesSubplot:xlabel='Dependents', ylabel='Count'>
From the graph above we know that 0 dependents is most common among the non-married, which makes sense since unmarried applicants are less likely to have children. The married applicants do report dependents, mostly 2, followed by 1, with the fewest at 3 or more dependents.
# Coapplicant income by education and loan status, one panel per area.
# (catplot creates its own figure, so the plt.figure call above produces
# the empty Figure seen in the output.)
plt.figure(figsize=(18, 14))
sns.catplot(data=df_loan_imputer_before_outlier_rounded, x="Education",
            y="CoapplicantIncome", hue="Loan_Status", col="Property_Area",
            kind="boxen")
<seaborn.axisgrid.FacetGrid at 0x192c7246070>
<Figure size 1296x1008 with 0 Axes>
The figure shows that approved loans have a higher distribution of coapplicant income than rejected loans in rural and semiurban areas. But for the urban properties (left panel), among graduates, rejected loans show a higher income distribution than approved loans.
# Applicant income by education and loan status, one panel per area.
plt.figure(figsize=(18, 14))
sns.catplot(data=df_loan_imputer_before_outlier_rounded, x="Education",
            y="ApplicantIncome", hue="Loan_Status", col="Property_Area",
            kind="boxen")
<seaborn.axisgrid.FacetGrid at 0x192c6e30940>
<Figure size 1296x1008 with 0 Axes>
From the figure above we can see how applicant income is distributed across education levels, property areas, and loan status.
# Scatter of applicant vs. coapplicant income, colored by loan status.
fig = px.scatter(df_loan_imputer_before_outlier_rounded, x="ApplicantIncome", y="CoapplicantIncome", color="Loan_Status")
fig.show()
The graph above shows that income (applicant or coapplicant) does not have a significant correlation with whether the loan status is Y or N.
# Oversample the minority class (rejected loans) with SMOTE so both target
# classes have equal counts; the value_counts() output confirms the balance.
sm = SMOTE(sampling_strategy='minority', random_state=42)
feature_frame = df_loan_imputer_before_outlier_rounded.drop('Loan_Status', axis=1)
target_series = df_loan_imputer_before_outlier_rounded['Loan_Status']
oversampled_X, oversampled_Y = sm.fit_resample(feature_frame, target_series)
oversampled = pd.concat([pd.DataFrame(oversampled_Y), pd.DataFrame(oversampled_X)], axis=1)
oversampled['Loan_Status'].value_counts()
0.0 399 1.0 399 Name: Loan_Status, dtype: int64
From the result above we now have balanced target classes, which will be used to train the models to see whether performance improves compared to the raw model.
# Logistic regression on the SMOTE-balanced data; class 0 (rejected loan)
# is reported as the positive class, matching the earlier cells.
lr = LogisticRegression()
lr_clf = lr.fit(X_train, y_train)
y_pred = lr_clf.predict(X_test)
clf_metrics = metrics.confusion_matrix(y_test, y_pred)
(tp, fn), (fp, tn) = clf_metrics
print("TP : {}, FN : {}, FP : {}, TN : {}".format(tp, fn, fp, tn))
print("Recall : {}\n".format(tp / (tp + fn)))
clf_report = metrics.classification_report(y_test, y_pred)
print(clf_report)
TP : 73, FN : 62, FP : 2, TN : 127
Recall : 0.5407407407407407
precision recall f1-score support
0.0 0.97 0.54 0.70 135
1.0 0.67 0.98 0.80 129
accuracy 0.76 264
macro avg 0.82 0.76 0.75 264
weighted avg 0.83 0.76 0.75 264
# Decision tree on the SMOTE-balanced data.
dtc = DecisionTreeClassifier()
dtc_clf = dtc.fit(X_train, y_train)
y_pred = dtc_clf.predict(X_test)
clf_metrics = metrics.confusion_matrix(y_test, y_pred)
flat = clf_metrics.ravel()  # cm[0][0], cm[0][1], cm[1][0], cm[1][1]
print("TP : {}, FN : {}, FP : {}, TN : {}".format(flat[0], flat[1], flat[2], flat[3]))
print("Recall : {}\n".format(flat[0] / (flat[0] + flat[1])))
clf_report = metrics.classification_report(y_test, y_pred)
print(clf_report)
TP : 103, FN : 32, FP : 26, TN : 103
Recall : 0.762962962962963
precision recall f1-score support
0.0 0.80 0.76 0.78 135
1.0 0.76 0.80 0.78 129
accuracy 0.78 264
macro avg 0.78 0.78 0.78 264
weighted avg 0.78 0.78 0.78 264
# KNN classifier on the SMOTE-balanced data.
knn = KNeighborsClassifier()
knn_clf = knn.fit(X_train, y_train)
y_pred = knn_clf.predict(X_test)
clf_metrics = metrics.confusion_matrix(y_test, y_pred)
cells = [clf_metrics[0][0], clf_metrics[0][1], clf_metrics[1][0], clf_metrics[1][1]]
print("TP : {}, FN : {}, FP : {}, TN : {}".format(*cells))
print("Recall : {}\n".format(cells[0] / (cells[0] + cells[1])))
clf_report = metrics.classification_report(y_test, y_pred)
print(clf_report)
TP : 83, FN : 52, FP : 26, TN : 103
Recall : 0.6148148148148148
precision recall f1-score support
0.0 0.76 0.61 0.68 135
1.0 0.66 0.80 0.73 129
accuracy 0.70 264
macro avg 0.71 0.71 0.70 264
weighted avg 0.71 0.70 0.70 264
# SVC on the SMOTE-balanced data.
svc = SVC()
svc_clf = svc.fit(X_train, y_train)
y_pred = svc_clf.predict(X_test)
clf_metrics = metrics.confusion_matrix(y_test, y_pred)
true_rej, false_app = clf_metrics[0]
false_rej, true_app = clf_metrics[1]
print("TP : {}, FN : {}, FP : {}, TN : {}".format(true_rej, false_app, false_rej, true_app))
print("Recall : {}\n".format(true_rej / (true_rej + false_app)))
clf_report = metrics.classification_report(y_test, y_pred)
print(clf_report)
TP : 72, FN : 63, FP : 3, TN : 126
Recall : 0.5333333333333333
precision recall f1-score support
0.0 0.96 0.53 0.69 135
1.0 0.67 0.98 0.79 129
accuracy 0.75 264
macro avg 0.81 0.76 0.74 264
weighted avg 0.82 0.75 0.74 264
Raw model machine learning:
This notebook shows that the raw model has better accuracy but a weaker F1-score, since the raw model tends to overfit to the approved-loan class, influenced by the data containing more approved loans.
After using SMOTE to handle the imbalanced data, the F1-score improves since the SMOTE model has a smaller gap between the per-class averages, but its accuracy is lower than the raw model's.